/*** GENERATES SAMPLE OF STUDENTS WITH 2011-2019 TRANSCRIPT AND ATTENDANCE RECORDS ***/

set more 1
clear all

do "paths.do"

capture log close
log using "$LOGFILES\tracking_2.log", replace

/*** Variables used in this do-file:

ID1, ID2, invalid id flags
grade: "EE","PK","KG"and 01-12
camptype: 
  0-campus assigned non-enrolled
  1-instructional campus
  2-alternative instructional unit
  3-budgeted campus
  4-private school
  5-summer school
  6-district administration
  7-shared servces arrange admin
  8-juvenile justice alt ed pgm
  9-open enrollment charter school
  A-daep only campus
PEIMS_HIGHGRADE: highest grade reported at the campus for enrollment
ORG_HIGHGRADE: highest grade offered at campus according to TEA's Org database
campstat: indicates whether campus is:
  A-active  D-dormant  O-obsolete  U-under construction
PEIMS_GRADEGRP1: whether campus is considered an elementary, middle, junior high, high,
 elementary/secondary or other grade group campus (in PEIMS data)
ORG_GRADEGRP1: similar to PEIMS_GRADEGRP1 (but derived from Organization database)
REG_ALT_ACCNT: indicates whether the campus is registered for Alternative Accountability 
CAMP_CHARTTYPE: indicates whether the campus is
 00-not a charter school
 01-member of open enrollment charter school district that anticipates <75% at risk students
 02-member of open enrollment charter school district that anticipates 75% at risk
 03-campus-level charter
 04-member of an open enrollment college or university charter district
setting: see do file named "setting_label_disability.do"

An instructional campus: 1) has an assigned administrator, 2) has enrolled students who are 
counted for ADA 3) has assigned instructional staff, 4) receives federal, state and/or local
funds as its primary support, 5) provides instruction in the Texas Essential Knowledge and
Skills (TEKS), 6) has one or more grade groups from early education through grade 12, and
7) is not a program for students enrolled in another public school

An alternative instructional unit: 1) is an instructional site, center, program, or
arrangement that is not governed by an individual campus organization, and 2) does not
meet the above definition for a standard instructional campus
***/


*************************************************
*** Identify students in the transcript files ***
*************************************************

** only keep individual and campus ids
foreach num of numlist 11(1)19{

	use id2 ID1 campus using "$indata\p_course_complete`num'", clear
	duplicates drop
	gen year=20`num'
	
	** identify and drop if IDs are conflicting
	rename ID1 id1
	* note: tag is set to 0 if id1 or id2 missing
	egen tmp0=tag(id1 id2 year)
	egen tmp1=total(tmp0) if id2!="", by(id2 year)
	egen tmp2=total(tmp0) if id1!="", by(id1 year)
	tab tmp1 tmp2, m cell
	drop if (tmp1>1 & tmp1!=.)|(tmp2>1 & tmp2!=.)
	* fill in missing values for ids when can
	* note: id1 is never missing
	assert id1!=""
	count if id2==""
	egen tmp3=mode(id2) if id1!="", by(id1 year)
	replace id2=tmp3 if id2==""
	* drop new duplicates created (if any)
	drop tmp*
	duplicates drop

	** unit of observation is student x campus
	assert campus!=""
	duplicates tag id1 id2 campus year, gen(tmp1)
	duplicates tag id1 id2 year, gen(tmp2)
	assert tmp1==0
	tab tmp2
	drop tmp*
	rename id2 id2_trns
	descr
	save "$WORKING\temp\tracking_2_A`num'", replace
	di "$S_DATE $S_TIME `num': 99"
	
}

foreach num of numlist 11(1)19{
	if (`num' == 11) {
		use "$WORKING\temp\tracking_2_A`num'", clear
	}
	else {
		append using "$WORKING\temp\tracking_2_A`num'"
	}
}

save "$WORKING/temp/tracking_2_A", replace


************************************************
*** Identify students in the attendance file ***
************************************************

/** Note: we use the attendance data for 6-week periods rather than the annual
 summary files (p_attend_demog*) since those identify accountable 
 campuses rather than campuses attended. If a student changes schools within
 a district, all attendance days are allocated to the school that is
 determined to be accountable for the student for assigning school ratings. **/

** combining records across years

foreach num of numlist 11(1)19{

	use "$indata\p_attend_student`num'", clear
	gen year=2000 + `num'
	if (`num' <= 13) {
		rename INVALID_ID1_FLAG invalid_id1_flag
		rename STATE_ASSIGNED_FLAG state_assigned_flag 
	}

	** identify and drop if IDs are conflicting
	rename ID1 id1
	egen tmp0 = tag(id1 id2 year)
	egen tmp1=total(tmp0) if id2!="", by(id2 year)
	egen tmp2=total(tmp0) if id1!="", by(id1 year)
	tab tmp1 tmp2, m cell
	drop if (tmp1>1 & tmp1!=.)|(tmp2>1 & tmp2!=.)
	* fill in missing values for ids when can
	* note: id1 is never missing
	assert id1!=""
	count if id2==""
	egen tmp3=mode(id2) if id1!="", by(id1 year)
	replace id2=tmp3 if id2==""
	
	** make sure flags are constant 
	* note: invalid_id1_flag refers to id1 and state_assigned_flag refers to id2
	destring invalid_id1_flag state_assigned_flag, replace
	egen tmp4=max(invalid_id1_flag) if id1!="", by(id1 year)
	tab invalid_id1_flag tmp4, m
	assert invalid_id1_flag==tmp4
	egen tmp5=max(state_assigned_flag) if id2!="", by(id2 year)
	tab state_assigned_flag tmp5, m
	replace state_assigned_flag=tmp5
	drop tmp*

	** keep only variables needed
	rename *, lower
	descr
	keep id2 id1 invalid_id1_flag state_assigned_flag campus grade year sixweek days_present
	assert campus!="" & days_present!=. & grade!=""
	tab sixweek, m
	tab grade, m
	replace grade="-2" if grade=="EE"
	replace grade="-1" if grade=="PK"
	replace grade="0" if grade=="KG"
	destring grade, replace

	** drop records with 0 attendance
	gen tmp=(days_present==0)
	tabstat tmp, by(grade)
	drop if tmp==1
	drop tmp

	** collapse to sum by campus x grade x year x 6-week period and reshape
	duplicates tag id1 id2 campus year grade sixweek, gen(tmp)
	tab tmp
	collapse (rawsum) days_present, by (id2 id1 invalid_id1_flag state_assigned_flag campus grade year sixweek)
	reshape wide days_present, i(id2 id1 invalid_id1_flag state_assigned_flag campus grade year) j(sixweek)
	for num 1/6: recode days_presentX (.=0) 
	rename days_present* attdays*
	egen attdays=rowtotal(attdays*)

	** keep earliest record w/most days if attend multiple grades by campus x year
	duplicates tag id1 id2 campus year, gen(tmp)
	tab tmp
	* 1st observation will be sorted to be the one with most and earliest attendance days
	gsort id2 id1 campus year -attdays1 -attdays2 -attdays3 -attdays4 -attdays5 -attdays6 -grade
	drop if id2==id2[_n-1] & id1==id1[_n-1] & campus==campus[_n-1] & year==year[_n-1]
	la var grade "grade from attendance file"
	for num 1/6: la var attdaysX "days attended grading period X, by campus"
	la var attdays "total days attended all grading periods, by campus"
	drop tmp
	rename id2 id2_att
	descr

	save "$WORKING\temp\tracking_2_B`num'", replace
	di "$S_DATE $S_TIME `num': 202"
}


foreach num of numlist 11(1)19{
	if (`num' == 11) {
		use "$WORKING\temp\tracking_2_B`num'", clear
	}
	else {
		append using "$WORKING\temp\tracking_2_B`num'"
	}
}

save "$WORKING/temp/tracking_2_B", replace


************************
*** Merge both files ***
************************

** merge by student x campus x year
merge 1:1 id1 campus year using "$WORKING/temp/tracking_2_A", update

** create summary measures across multiple records
egen grade_lo=min(grade), by(id1 year)
egen grade_hi=max(grade), by(id1 year)
egen attdayst=sum(attdays), by(id1 year)
egen num_att=sum(_merge==1|_merge==3), by(id1 year)
egen num_trns=sum(_merge==2|_merge==3), by(id1 year)
egen tmp1=mode(id2_att), by(id1 year)
replace id2_att=tmp1 if id2_att==""
egen tmp2=mode(id2_trns), by(id1 year)
replace id2_trns=tmp2 if id2_trns==""
la var grade_lo "lowest grade in attendance records, across campuses"
la var grade_hi "highest grade in attendance records, across campuses"
la var attdayst "total days in att summed across campuses"
la var num_att "number of campus attendance records"
la var num_trns "number of campus transcript records"
drop tmp*

** drop grades below grade 3, since no transcripts and/or test scores
tab grade if _merge==1, m
gen tmp1=(_merge==1)
tabstat tmp1, by(grade)
drop if grade<=2
* drop transcript records if no match and highest grade below 3
drop if _merge==2 & grade_hi<=2
drop tmp*

** drop transcript records for students with no attendance records at all
tab num_att, m
drop if num_att==0

** cross-check IDs
gen tmp1=(id2_att!=id2_trns) if id2_att!="" & id2_trns!=""
tab tmp1, m 
gen id2=id2_att if tmp1!=1
replace id2=id2_trns if id2=="" & tmp1!=1
gen tmp2=id2==""
summ tmp2
drop id2_*att id2_*trns tmp*

** indicator for attendance in 1st and 3rd 6-week grading periods
descr attdays*
tab attdays1, m  
gen sixweek1=(attdays1!=0 & attdays1!=.)
egen sixweek1t=max(sixweek1), by(id1 year)
la var sixweek1 "attended the first six week period in this campus"
la var sixweek1t "attended the first six week period in any campus"      
tab attdays3, m
gen sixweek3=(attdays3!=0 & attdays3!=.)
egen sixweek3t=max(sixweek3), by(id1 year)
la var sixweek3 "attended the third six week period in this campus"
la var sixweek3t "attended the third six week period in any campus"

** courses are only recorded if enrolled at end of semester (generally 90 days)
descr attdays attdayst
gen attlow=(attdays<90)
replace attlow=. if attdays==.
gen attlowt=(attdayst<90)
replace attlowt=. if attdayst==.
la var attlow "attended this campus < 90 days"
la var attlowt "attendance across all campuses < 90 days"
tab attlow sixweek3
tab attlowt sixweek3t

** record type
/**
1:stud*camp in both att (A) and trns (T) files
2:stud*camp only in A file, but stud matched to T file for a diff camp
3:stud*camp only in A file, and stud not matched to T file for any camp though T record exists
4:stud*camp only in A file, and stud does not have a T record at all
5:stud*camp only in T file but stud matched to A file for another camp
6:stud*camp only in T file and stud not matched to A file for any camp though A record exists
**/
egen tmp1=max(_merge==3), by(id1 year)
gen rctype=1 if _merge==3
replace rctype=2 if _merge==1 & tmp1==1
replace rctype=3 if _merge==1 & tmp1==0 & num_trns>0 & num_trns!=.
replace rctype=4 if _merge==1 & tmp1==0 & num_trns==0
replace rctype=5 if _merge==2 & tmp1==1
replace rctype=6 if _merge==2 & tmp1==0 & num_att>0 & num_att!=.
tab rctype, m
for num 1/6: gen rctype_X=(rctype==X)
la var rctype "stud*camp match btwn att (A) & trns (T) records"
la var rctype_1 "stud*camp AT match"
la var rctype_2 "stud*camp A only, other AT match"
la var rctype_3 "stud*camp A only, has T but no AT match"
la var rctype_4 "stud*camp A only, no T"
la var rctype_5 "stud*camp T only, other AT match"
la var rctype_6 "stud*camp T only, has A but no AT match"
drop _merge tmp*

save "$WORKING\tracking_2", replace


******************************
*** Add campus information ***
******************************

use "$indata\p_campus11", clear
gen year=2011
append using "$indata\p_campus12"
replace year=2012 if year==.
append using "$indata\p_campus13"
replace year=2013 if year==.
append using "$indata\p_campus14"
replace year=2014 if year==.
append using "$indata\p_campus15"
replace year=2015 if year==.
append using "$indata\p_campus16"
replace year=2016 if year==.
append using "$indata\p_campus17"
replace year=2017 if year==.
append using "$indata\p_campus18"
replace year=2018 if year==.
append using "$indata\p_campus19"
replace year=2019 if year==.
drop if campus==""
rename *, lower

merge 1:m campus year using "$WORKING\tracking_2", update
for var camptype camp_charttype campstat reg_alt_accnt: tab X _merge, m
drop if _merge==1
rename reg_alt_accnt camp_alt_accnt
* note: districtID is the first 6 digits in the campus ID
gen tmp1=substr(campus,1,6)
gen tmp2=(district==tmp1) if district!="" & campus!=""
assert tmp2==1 if tmp2!=.
drop campstat_dt dtupdate date_update campstat district _merge tmp*

** grades served
tab peims_highgrade org_highgrade, m
tab peims_gradegrp1 org_gradegrp1, m
rename peims_highgrade Chigrd
rename peims_lowgrade Clogrd
rename peims_gradegrp1 Cgrdgrp
la var Chigrd "campus highest grade w/enrollment"
la var Clogrd "campus lowest grade w/enrollment"
la var Cgrdgrp "campus grade group"
drop org_lowgrade org_highgrade org_gradegrp1

** campus types
assert camptype!="" & camp_charttype!=""
tab camptype camp_charttype, m
tab camptype camp_alt_accnt, m
gen Ct_reg=(camptype=="1")
gen Ct_altrg=(camptype=="2" & camp_alt_accnt=="")
gen Ct_altsp=(camptype=="2" & camp_alt_accnt=="Y")
gen Ct_juv=(camptype=="8")
gen Ct_daep=(camptype=="A")
gen Ct_chrtd=(camp_charttype=="01")
gen Ct_chrtc=(camp_charttype=="03")
gen Ct_chrtu=(camp_charttype=="04")
la var Ct_reg "instructional campus"
la var Ct_altrg "alternative instr unit, regular acct system"
la var Ct_altsp "alternative instr unit, alt acct system"
la var Ct_juv "juvenile justice alt ed pgm"
la var Ct_daep "DAEP only campus"
la var Ct_chrtd "member of open enr chrtr dist w/<75% at risk stud"
la var Ct_chrtc "campus-level charter"
la var Ct_chrtu "member of open enr college chrtr dist"
drop camptype camp_alt_accnt camp_charttype

** student ever in disciplinary setting
gen tmp=(Ct_juv==1|Ct_daep==1)
egen Pdelinqu=max(tmp), by(id1 year)
la var Pdelinqu "has record from juvenile justice or DAEP program"
drop tmp

order id2 id1 invalid_id1_flag state_assigned_flag year Pdelinqu att* sixweek* grade* num_* rctype* campus campname C* 
summ
descr
save "$WORKING\tracking_2", replace

log close
